#!/bin/bash

# Copyright 2015 Guoguo Chen
# Apache 2.0

# Aligns data with CNTK models.

# Begin configuration section.
# Every variable below is a default; the values may be replaced when
# parse_options.sh processes the command line further down.

# Job control.
nj=4                 # number of parallel jobs
cmd="run.pl"         # how to run jobs (e.g. run.pl or queue.pl plus options)
stage=0              # the alignment step runs only when stage <= 0
parallel_opts=""     # extra options handed to $cmd for the alignment job
num_threads=1        # forwarded to cntk as numThreads=<n>

# Options forwarded to align-compiled-mapped.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10              # passed as --beam
retry_beam=40        # passed as --retry-beam

# Options forwarded to the cntk command line.
cntk_forward_opts="" # free-form arguments placed right after "cntk"
cntk_config=""       # CNTK config file; passed as configFile=<path>
device=-1            # passed as DeviceNumber=<n>
feature_transform="NO_FEATURE_TRANSFORM"  # passed as featureTransform=<path>;
                                          # this sentinel skips the file check
feat_dim=""          # passed as featDim=<n>; computed with feat-to-dim if empty
# End configuration options.

# Echo the full command line so that it ends up in the logs.
if [ $# -gt 0 ]; then
  echo "$0 $*"
fi

# Pull in the environment from path.sh when it exists in the current
# directory, then let parse_options.sh process the command line.
if [ -f path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1

# Exactly four positional arguments must remain; otherwise print usage.
if [ $# -ne 4 ]; then
  cat <<EOF
usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>
e.g.:  $0 data/train data/lang exp/tri1 exp/tri1_ali
main options (for others, see top of script file)
  --config <config-file>                           # config containing options
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
  exit 1
fi

# Positional arguments: data directory, lang directory, source model
# directory, and the output alignment directory.
data=$1
lang=$2
srcdir=$3

# Create the output directory *before* canonicalizing it. `readlink -f`
# fails and prints nothing when an intermediate path component does not exist,
# which would leave $dir empty and make later writes target "/log", "/tree"...
mkdir -p "$4/log" || exit 1;
dir=$(readlink -f "$4") || exit 1;
if [ -z "$dir" ]; then
  echo "$0: could not resolve output directory $4"
  exit 1;
fi

# Record the number of jobs used to produce the per-job outputs in $dir.
echo "$nj" > "$dir/num_jobs" || exit 1;

# Split the data into $nj pieces unless an existing split directory is newer
# than feats.scp.
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1;

# Parallelization: when more than one thread is requested and the caller did
# not supply parallel_opts, derive a default to hand to $cmd.
if [ -z "$parallel_opts" ] && [ "$num_threads" -gt 1 ]; then
  parallel_opts="--num-threads ${num_threads}"
fi
# The thread count is always appended to the cntk arguments.
cntk_forward_opts="${cntk_forward_opts} numThreads=${num_threads}"

# Checks files.
kaldi_model=$srcdir/final.mdl
cntk_model=$srcdir/cntk.mdl

# cntk_config defaults to empty. An empty value would vanish from the file
# list below (and so never be checked), so reject it explicitly.
if [ -z "$cntk_config" ]; then
  echo "$0: cntk_config is empty; please specify the CNTK config file"
  exit 1;
fi

# Every input needed further down must exist as a regular file.
for f in "$sdata/1/feats.scp" "$sdata/1/text" "$srcdir/tree" \
  "$lang/L.fst" "$cntk_model" "$kaldi_model" "$cntk_config"; do
  if [ ! -f "$f" ]; then
    echo "$0: missing file $f"
    exit 1;
  fi
done

# The feature transform is optional; the sentinel value means "none".
if [ "$feature_transform" != "NO_FEATURE_TRANSFORM" ]; then
  if [ ! -f "$feature_transform" ]; then
    echo "$0: missing file $feature_transform"
    exit 1;
  fi
fi

# Copy the tree and both models into the output directory, dereferencing
# symlinks (-L) so that $dir holds real files.
cp -L "$srcdir/tree" "$dir" || exit 1;
cp -L "$cntk_model" "$dir" || exit 1;
cp -L "$kaldi_model" "$dir" || exit 1;

# Keep a copy of the CNTK config that was used. -p lets the script be rerun
# on an existing output directory without mkdir reporting an error.
mkdir -p "$dir/configs" || exit 1;
cp -f "$cntk_config" "$dir/configs/Align.cntk" || exit 1;

# Features to be fed to CNTK. "JOB" is a placeholder that is left literal here
# and appears in the commands handed to $cmd with JOB=1:$nj below.
feats="scp:$sdata/JOB/feats.scp"
feats_one="scp:$sdata/1/feats.scp"

# Feature dimension: use the value given on the command line, otherwise
# measure it on the first split.
if [ -z "$feat_dim" ]; then
  feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
fi

# Label dimension: 4th field of the am-info line that mentions "pdfs".
# The exit status of a pipeline is that of its last command, so a failure of
# am-info would go unnoticed; check for an empty result instead.
label_dim=$(am-info "$kaldi_model" 2>/dev/null | awk '/pdfs/ {print $4;}')
if [ -z "$label_dim" ]; then
  echo "$0: could not get the number of pdfs from $kaldi_model"
  exit 1;
fi

cntk_input_counts="$sdata/JOB/input_cntk.counts"
cntk_input_feats="$sdata/JOB/input_cntk_feats.scp"

# $cmd is left unquoted on purpose: it may carry its own options
# (e.g. "queue.pl <queue opts>").
# Per job: write the output of feat-to-len for that split.
$cmd JOB=1:$nj $dir/log/split_input_counts.JOB.log \
  feat-to-len "$feats" ark,t:"$cntk_input_counts" || exit 1;
# Per job: write a one-line file holding the feature specifier of that split.
# The escaped ">" is passed through to the job instead of redirecting here.
$cmd JOB=1:$nj $dir/log/make_input_feats.JOB.log \
  echo "$feats" \> $cntk_input_feats || exit 1;

# Command line whose output serves as the CNTK-generated features. The
# trailing "|" makes the string a piped input once prefixed with "ark:".
cntk_feats="cntk $cntk_forward_opts"
cntk_feats+=" featureTransform=$feature_transform"
cntk_feats+=" ExpDir=$dir"
cntk_feats+=" configFile=$cntk_config"
cntk_feats+=" DeviceNumber=$device"
cntk_feats+=" modelName=$cntk_model"
cntk_feats+=" labelDim=$label_dim"
cntk_feats+=" featDim=$feat_dim"
cntk_feats+=" inputCounts=$cntk_input_counts"
cntk_feats+=" inputFeats=$cntk_input_feats"
cntk_feats+=" |"

echo "$0: aligning data '$data' using $cntk_model"

# Map OOVs in the reference transcription: the value read from oov.int is
# handed to sym2int.pl via --map-oov.
oov=$(cat "$lang/oov.int") || exit 1;
# Per-job transcription, converted by sym2int.pl using words.txt; fields from
# the 2nd onward are converted (-f 2-).
tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"

if [ "$stage" -le 0 ]; then
  # Compile the training graphs and pipe them straight into the aligner, which
  # reads the CNTK-generated features and writes per-job ark/scp alignments.
  # $cmd and $parallel_opts are unquoted on purpose so that any options they
  # contain are split into separate words.
  $cmd $parallel_opts JOB=1:$nj $dir/log/align.JOB.log \
    compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \
    align-compiled-mapped $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl ark:- \
    "ark:$cntk_feats" "ark,scp:$dir/ali.JOB.ark,$dir/ali.JOB.scp" || exit 1;
fi

# Merge the per-job alignment indexes into a single ali.scp. Jobs are listed
# explicitly rather than with a glob so that (a) stale ali.N.scp files left by
# an earlier run with a larger number of jobs are not picked up, and (b) the
# pieces are concatenated in job order instead of lexical order (1, 10, 11, 2).
for ((n = 1; n <= nj; n++)); do
  cat "$dir/ali.$n.scp" || exit 1;
done > "$dir/ali.scp" || exit 1;

echo "$0: done aligning data."
